﻿using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections.Specialized;

namespace SDT.CrawlSystem.Servers.Filters
{
    /// <summary>
    /// String-level filters for HTML that was saved or pasted from Microsoft Word:
    /// one strips Word-specific markup, the other converts typographic punctuation
    /// to named HTML entities.
    /// </summary>
    public static class MSWordFilter
    {
        private const RegexOptions PatternOptions = RegexOptions.IgnoreCase | RegexOptions.Compiled;

        // Built once and reused; the patterns are applied in this order by CleanWordHtml.
        private static readonly Regex[] WordMarkupPatterns = new Regex[]
        {
            // HTML comments (including conditional comments) and the <title> element.
            // [\s\S] matches any character, newlines included.
            new Regex(@"<!--[\s\S]+?-->", PatternOptions),
            new Regex(@"<title>[\s\S]+?</title>", PatternOptions),
            // Unquoted class attributes and single-quoted style attributes.
            new Regex(@"\s?class=\w+", PatternOptions),
            new Regex(@"\s+style='[^']+'", PatternOptions),
            // Document-level, Office-namespace, smart-tag and wrapper tags (the tags only, not their content).
            new Regex(@"<(meta|link|/?o:|/?style|/?div|/?st\d|/?head|/?html|body|/?body|/?span|!\[)[^>]*?>", PatternOptions),
            // Elements whose only content is a non-breaking space.
            new Regex(@"(<[^>]+>)+&nbsp;(</\w+>)+", PatternOptions),
            // Double-quoted attributes in the v: namespace.
            new Regex(@"\s+v:\w+=""[^""]+""", PatternOptions),
            // Matches repeated LF+CR pairs. On CRLF text this collapses a run of three or
            // more consecutive line breaks into one; LF-only text is left untouched.
            // NOTE(review): presumably meant to drop blank lines - kept as-is so existing output does not change.
            new Regex(@"(\n\r){2,}", PatternOptions),
        };

        /// <summary>
        /// Removes Word-specific markup from <paramref name="html"/> by deleting every match of a
        /// fixed list of case-insensitive patterns, one pattern after another.
        /// </summary>
        /// <param name="html">The HTML text to clean.</param>
        /// <returns>
        /// The cleaned HTML. A null or empty <paramref name="html"/> is returned unchanged.
        /// </returns>
        public static string CleanWordHtml(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return html;
            }

            foreach (Regex pattern in WordMarkupPatterns)
            {
                html = pattern.Replace(html, "");
            }
            return html;
        }

        /// <summary>
        /// Replaces typographic double quotes and the en dash with named HTML entities.
        /// Straight double quotes are left alone because they delimit HTML attribute values.
        /// </summary>
        /// <param name="html">The HTML text to convert.</param>
        /// <returns>
        /// The converted text. A null or empty <paramref name="html"/> is returned unchanged.
        /// </returns>
        public static string FixEntities(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return html;
            }

            // Characters are written as \u escapes so they cannot be silently turned into
            // ASCII look-alikes if the source file is ever re-encoded.
            return html
                .Replace("\u201C", "&ldquo;")   // left double quotation mark
                .Replace("\u201D", "&rdquo;")   // right double quotation mark
                // NOTE(review): U+2013 is an en dash but is emitted as &mdash;; confirm whether &ndash; is wanted.
                .Replace("\u2013", "&mdash;");
        }

    }
}
